Copyright 2019 Google LLC

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Generating the cell lines data

We download the data from https://github.com/LuyiTian/sc_mixology/tree/master/data/csv and copy it to the adenocarcinoma folder (where the notebook is launched).


In [1]:
library(SingleCellExperiment)
library(scran)
library(scater)
library(Seurat)


Loading required package: SummarizedExperiment
Loading required package: GenomicRanges
Loading required package: stats4
Loading required package: BiocGenerics
Loading required package: parallel

Attaching package: 'BiocGenerics'

The following objects are masked from 'package:parallel':

    clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
    clusterExport, clusterMap, parApply, parCapply, parLapply,
    parLapplyLB, parRapply, parSapply, parSapplyLB

The following objects are masked from 'package:stats':

    IQR, mad, sd, var, xtabs

The following objects are masked from 'package:base':

    Filter, Find, Map, Position, Reduce, anyDuplicated, append,
    as.data.frame, basename, cbind, colMeans, colSums, colnames,
    dirname, do.call, duplicated, eval, evalq, get, grep, grepl,
    intersect, is.unsorted, lapply, lengths, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, rank, rbind,
    rowMeans, rowSums, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which, which.max, which.min

Loading required package: S4Vectors

Attaching package: 'S4Vectors'

The following object is masked from 'package:base':

    expand.grid

Loading required package: IRanges
Loading required package: GenomeInfoDb
Loading required package: Biobase
Welcome to Bioconductor

    Vignettes contain introductory material; view with
    'browseVignettes()'. To cite Bioconductor, see
    'citation("Biobase")', and for packages 'citation("pkgname")'.

Loading required package: DelayedArray
Loading required package: matrixStats

Attaching package: 'matrixStats'

The following objects are masked from 'package:Biobase':

    anyMissing, rowMedians

Loading required package: BiocParallel

Attaching package: 'DelayedArray'

The following objects are masked from 'package:matrixStats':

    colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges

The following objects are masked from 'package:base':

    aperm, apply

Loading required package: ggplot2

Attaching package: 'scater'

The following object is masked from 'package:S4Vectors':

    rename

The following object is masked from 'package:stats':

    filter


In [2]:
run_qc <- function(sce) {
  isSpike(sce, "ERCC") <- grepl("^ERCC", rownames(sce))
  sce <- calculateQCMetrics(sce)

  # Identify outliers, but without using the mouse as a batch
  libsize.drop <- isOutlier(sce$total_counts, nmads=3, type="lower", log=TRUE)
  feature.drop <- isOutlier(sce$total_features_by_counts, nmads=3, type="lower", log=TRUE)
  spike.drop <- isOutlier(sce$pct_counts_ERCC, nmads=3, type="higher")
  keep <- !(libsize.drop | feature.drop | spike.drop)
  sce <- sce[,keep]

  num.cells <- nexprs(sce, byrow=TRUE)
  to.keep <- num.cells > 0
  print(sum(!to.keep))
  sce <- sce[to.keep,]
  sce
}

In [3]:
tissue = 'sc_celseq2_5cl_p1'
path <- paste('adenocarcinoma', tissue, sep='/')
counts <- read.csv(paste(path, 'count', 'csv', sep='.'))
metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))
sce_p1 <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),
                            colData = as.data.frame(metadata))

tissue = 'sc_celseq2_5cl_p2'
path <- paste('adenocarcinoma', tissue, sep='/')
counts <- read.csv(paste(path, 'count', 'csv', sep='.'))
metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))
sce_p2 <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),
                            colData = as.data.frame(metadata))

tissue = 'sc_celseq2_5cl_p3'
path <- paste('adenocarcinoma', tissue, sep='/')
counts <- read.csv(paste(path, 'count', 'csv', sep='.'))
metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))
sce_p3 <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),
                            colData = as.data.frame(metadata))

universe <- intersect(intersect(rownames(sce_p1), rownames(sce_p2)), rownames(sce_p3))
sce_p1 <- sce_p1[universe,]
sce_p2 <- sce_p2[universe,]
sce_p3 <- sce_p3[universe,]

colnames(sce_p2) <- sub('p1', 'p2', colnames(sce_p2))
colnames(sce_p3) <- sub('p1', 'p3', colnames(sce_p3))

sce_celseq2_5cl <- cbind(sce_p1, sce_p2, sce_p3)
sce_celseq2_5cl <- run_qc(sce_celseq2_5cl)

colData(sce_celseq2_5cl)$label <- colData(sce_celseq2_5cl)$cell_line_demuxlet
saveRDS(sce_celseq2_5cl, 'adenocarcinoma/sce/sc_celseq2_5cl.rds')
name = 'sc_celseq2_5cl'
write.csv(as.matrix(counts(sce_celseq2_5cl)), paste(name, 'counts.csv', sep='.'))
write.csv(colData(sce_celseq2_5cl), paste(name, 'metadata.csv', sep='.'))
write.csv(rowData(sce_celseq2_5cl), paste(name, 'featuredata.csv', sep='.'))


[1] 0

In [4]:
tissue = 'sc_celseq2'
path <- paste('adenocarcinoma', tissue, sep='/')
counts <- read.csv(paste(path, 'count', 'csv', sep='.'))
metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))
sce_celseq2 <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),
                            colData = as.data.frame(metadata))
sce_celseq2$label <- sce_celseq2$cell_line_demuxlet
saveRDS(sce_celseq2, 'adenocarcinoma/sce/sc_celseq2.rds')
name = 'sc_celseq2'
write.csv(as.matrix(counts(sce_celseq2)), paste(name, 'counts.csv', sep='.'))
write.csv(colData(sce_celseq2), paste(name, 'metadata.csv', sep='.'))
write.csv(rowData(sce_celseq2), paste(name, 'featuredata.csv', sep='.'))

In [5]:
sce_celseq2_5cl


class: SingleCellExperiment 
dim: 12653 895 
metadata(0):
assays(1): counts
rownames(12653): ENSG00000154529 ENSG00000215375 ... ENSG00000137413
  ENSG00000167157
rowData names(8): is_feature_control is_feature_control_ERCC ...
  total_counts log10_total_counts
colnames(895): p1_A1 p1_A10 ... p3_P8 p3_P9
colData names(54): unaligned aligned_unmapped ...
  pct_counts_in_top_500_features_ERCC label
reducedDimNames(0):
spikeNames(1): ERCC

In [15]:
tissue = 'sc_10x'
path <- paste('adenocarcinoma', tissue, sep='/')
counts <- read.csv(paste(path, 'count', 'csv', sep='.'))
metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))
sce_10x <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),
                            colData = as.data.frame(metadata))
sce_10x$label <- sce_10x$cell_line_demuxlet
saveRDS(sce_10x, 'adenocarcinoma/sce/sc_10x.rds')
name = 'sc_10x'
write.csv(as.matrix(counts(sce_10x)), paste(name, 'counts.csv', sep='.'))
write.csv(colData(sce_10x), paste(name, 'metadata.csv', sep='.'))
write.csv(rowData(sce_10x), paste(name, 'featuredata.csv', sep='.'))

In [13]:
tissue = 'sc_10x_5cl'
path <- paste('adenocarcinoma', tissue, sep='/')
counts <- read.csv(paste(path, 'count', 'csv', sep='.'))
metadata <- read.csv(paste(path, 'metadata', 'csv', sep='.'))
sce_10x_5cl <- SingleCellExperiment(assays = list(counts = as.matrix(counts)),
                            colData = as.data.frame(metadata))
sce_10x_5cl$label <- sce_10x$cell_line_demuxlet
saveRDS(sce_10x, 'adenocarcinoma/sce/sc_10x_5cl.rds')
name = 'sc_10x_5cl'
write.csv(as.matrix(counts(sce_10x)), paste(name, 'counts.csv', sep='.'))
write.csv(colData(sce_10x), paste(name, 'metadata.csv', sep='.'))
write.csv(rowData(sce_10x), paste(name, 'featuredata.csv', sep='.'))

In [11]:
ratio <- function(df) {
    table(df$label) / dim(df)[2]
}

In [21]:
ratio(sce_celseq2)
dim(sce_celseq2)[2]
ratio(sce_celseq2_5cl)
dim(sce_celseq2_5cl)[2]


    H1975     H2228    HCC827 
0.4087591 0.2956204 0.2956204 
274
     A549     H1975     H2228      H838    HCC827 
0.3586592 0.1430168 0.1474860 0.2178771 0.1329609 
895

In [23]:
ratio(sce_10x)
dim(sce_10x)[2]
ratio(sce_10x_5cl)
dim(sce_10x_5cl)[2]


    H1975     H2228    HCC827 
0.3470067 0.3481153 0.3048780 
902
     A549     H1975     H2228      H838    HCC827 
0.3205717 0.1123022 0.1934661 0.2235835 0.1500766 
3918